#############################################################################
# project: Prioritizing Exceptional Social Needs 
# file: 03_data_prep_and_merge.R 
# author: Michael Jankowski, Brian Dietrich
# task: Data preparation for conjoint analysis and merging data from stata dataset
# input: four_surveys, survey_data.csv
# last revision: 2023/07/3
# output: pooled_df, pooled_df2
############################################################################

library(ebal)
library(cobalt)

### Prepare the "four_surveys" object with the prep function ----

# Run prep_data() on every element of four_surveys and stack the results
# row-wise into a single pooled data frame.
pooled_df <- dplyr::bind_rows(lapply(four_surveys, prep_data))

# Store the sample identifier as a factor.
pooled_df$Sample <- as.factor(pooled_df$Sample)


### Merge relevant variables from the survey data set ----

# Read the survey export as a plain data.frame (data.table = FALSE).
C2 <- data.table::fread("../data/survey_data/survey_data.csv",
                        data.table = FALSE)

# Keep only the respondent id (v9) and the covariates merged below.
C2_merg <- subset(C2, select = c(v9, student, sector, age_group, female,
                                 v184_ger, west, lr_3))

# v9 is the key that is matched against respondentIndex in pooled_df.
C2_merg <- C2_merg %>%
  rename(respondentIndex = v9)

# Left merge: all rows of pooled_df are kept (all.x = TRUE); rows without a
# match in the survey data get NA covariates. Note that merge() sorts the
# result on the key column.
pooled_df2 <- merge(pooled_df, C2_merg, by = "respondentIndex", all.x = TRUE)


# Rearrange the level order of the merged covariates. Each entry lists the
# levels in the desired order; values not listed become NA.
factor_levels <- list(
  student   = c("Public Administration", "General Student Sample",
                "Social Work"),
  sector    = c("Private Sector", "Public Sector"),
  v184_ger  = c("Ja", "Nein"),
  west      = c("east", "west"),
  female    = c("male", "female"),
  age_group = c("18-34", "35-54", "55 and older"),
  lr_3      = c("left", "middle", "right")
)

# Re-create each listed column as a factor with its level order.
pooled_df2[names(factor_levels)] <- Map(
  factor,
  pooled_df2[names(factor_levels)],
  levels = factor_levels
)




# Display labels for the conjoint plots, keyed by attribute column name.
# "\n" inserts a line break into the label text.
plot_labels <- c(
  Composition  = "Household\nComposition",
  Unemployment = "Reason for\nUnemployment",
  Since        = "Duration of\nUnemployment",
  Supportive   = "Supportive\nBehavior",
  Education    = "Education\nLevel",
  Citizenship  = "Nationality"
)

# Set the label of each attribute column in turn.
for (attribute in names(plot_labels)) {
  label(pooled_df2[[attribute]]) <- plot_labels[[attribute]]
}


### Transform variables for the interaction ----

# Binary nationality: profiles whose Citizenship value contains "Germany"
# are "Native", every other profile is "Foreign".
pooled_df2$Citizenship_bin <- rep("Foreign", nrow(pooled_df2))
pooled_df2$Citizenship_bin[
  grepl("Germany", pooled_df2[["Citizenship"]])
] <- "Native"
pooled_df2$Citizenship_bin <- factor(pooled_df2$Citizenship_bin)

# Binary children indicator: profiles whose Composition value contains
# "Child" are "Kids", every other profile is "No Kids".
pooled_df2$kid <- rep("No Kids", nrow(pooled_df2))
pooled_df2$kid[grepl("Child", pooled_df2[["Composition"]])] <- "Kids"
pooled_df2$kid <- factor(pooled_df2$kid)


### New combined variable for German/Foreign & Kids/No Kids ----

# The combination is built from pooled_df2's own columns via a lookup table
# rather than attach()/detach(): attach() puts a copy of the data frame on
# the search path, where any same-named object in the global environment
# (e.g. a stray `kid`) would silently mask the column.
#
# Names are the "<Citizenship_bin> x <kid>" keys, values are the plot labels.
# The order of the entries is the level order of the resulting factor.
children_lookup <- c(
  "Foreign x No Kids" = "Foreign x No Kid(s)",
  "Native x No Kids"  = "Native x No Kid(s)",
  "Foreign x Kids"    = "Foreign x Kid(s)",
  "Native x Kids"     = "Native x Kid(s)"
)

children_key <- paste(pooled_df2$Citizenship_bin, pooled_df2$kid,
                      sep = " x ")

# Keys that are not in the lookup (e.g. a missing component) yield NA.
pooled_df2$Children <- unname(children_lookup[children_key])

# Quick check of the cell counts.
table(pooled_df2$Children)

pooled_df2$Children <- factor(pooled_df2$Children,
                              levels = unname(children_lookup))

# Display label for the conjoint plots.
label(pooled_df2$Children) <- "Kid(s)\nx\nNationality"
 

### Prepare data for entropy balancing ----

# Keep the samples whose name contains "civic", code the treatment indicator
# D (1 = "Public Sector", 0 = any other sector value, NA if sector is
# missing), and retain only rows of the first task with a known D.
match_data <- pooled_df2 %>%
  filter(grepl("civic", Sample)) %>%
  mutate(D = as.numeric(sector == "Public Sector")) %>%
  filter(!is.na(D), task == 1)

# Group sizes as recorded by the authors:
# 333 = Public Sector, 1285 = Private Sector
table(match_data$D)

# 0/1 dummies for the covariates; each is NA where its source is missing.
# NOTE(review): nongerman is 1 when v184_ger == "Ja" -- confirm that this
# matches the coding of v184_ger.
match_data <- match_data %>%
  mutate(
    young      = as.numeric(age_group == "18-34"),
    mid_age    = as.numeric(age_group == "35-54"),
    gender     = as.numeric(female == "female"),
    nongerman  = as.numeric(v184_ger == "Ja"),
    eastgerman = as.numeric(west == "east"),
    left       = as.numeric(lr_3 == "left"),
    center     = as.numeric(lr_3 == "middle")
  )

# Covariate dummies whose means are balanced between the two sector groups.
balance_covariates <- c("young", "mid_age", "gender", "eastgerman",
                        "left", "center")

# Balancing input: respondent id, treatment indicator and covariates,
# restricted to complete cases.
X <- match_data[, c("respondentIndex", "D", balance_covariates)]
X <- na.omit(X)

balance_vars <- X[, balance_covariates]
treatment <- X$D

# Entropy balancing: estimates weights for the control units (D == 0).
ewei <- ebalance(Treatment = treatment, X = balance_vars)

# Treated units (D == 1) keep weight 1; control units receive the
# estimated weights, in row order.
X$ebal <- replace(rep(1, nrow(X)), X$D == 0, ewei$w)

# Balance table (means and SDs), unadjusted and weighted.
bal.tab(
  balance_vars,
  treat = treatment,
  weights = X$ebal,
  disp = c("means", "sds"),
  un = TRUE
)

# Respondent-level weights to be joined back onto the pooled data.
ebalw <- X[, c("respondentIndex", "ebal")]

# Add the balancing weights to all rows of the respondents that were part of
# the balancing step; every other row gets ebal = NA.
# The join key is named explicitly so the join cannot silently pick up
# additional shared columns (and no "Joining with `by = ...`" message is
# emitted).
# NOTE(review): if ebalw holds more than one row per respondentIndex, this
# join multiplies rows -- confirm that the task == 1 filter used to build it
# leaves exactly one row per respondent.
pooled_df2 <- left_join(pooled_df2, ebalw, by = "respondentIndex")

# Conjoint data restricted to rows that received a balancing weight.
match_cj <- filter(pooled_df2, !is.na(ebal))




################################# End ###################################################
